library(data.table)
library(dplyr)
library(lubridate)
library(leaflet)
library(pals)
library(ggplot2)
# Read the 2004 daily PM2.5 export and report its dimensions (rows, columns).
# `file =` is passed explicitly: the path contains a space, and fread()'s
# `input` auto-detection treats such a string as a shell command when the
# file is not found, instead of failing with a clear "file not found" error.
# NOTE(review): this is an absolute, user-specific path; the script only runs
# on a machine where the file exists at exactly this location.
aqi2004 <- data.table::fread(
  file = 'C:/Users/Brandyn Ruiz/OneDrive/USC/PM566/Assignment1/aqi2004.csv'
)
dim(aqi2004)
## [1] 19233 20
# List the column names of the 2004 table.
colnames(aqi2004)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
# Show the type and first few values of every column in the 2004 table.
str(aqi2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 8.9 12.2 16.5 19.5 11.5 32.5 15.5 29.9 21 16.9 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 37 51 60 67 48 94 58 88 70 61 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88502 88502 88502 88502 88502 88502 88502 88502 88502 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Print the first six rows of the 2004 table.
head(aqi2004, n = 6L)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2004 AQS 60010007 1 8.9 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 19.5 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 37 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 67 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Print the last six rows of the 2004 table.
tail(aqi2004, n = 6L)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 12/14/2004 AQS 61131003 1 11 ug/m3 LC
## 2: 12/17/2004 AQS 61131003 1 16 ug/m3 LC
## 3: 12/20/2004 AQS 61131003 1 17 ug/m3 LC
## 4: 12/23/2004 AQS 61131003 1 9 ug/m3 LC
## 5: 12/26/2004 AQS 61131003 1 24 ug/m3 LC
## 6: 12/29/2004 AQS 61131003 1 9 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 46 Woodland-Gibson Road 1 100
## 2: 59 Woodland-Gibson Road 1 100
## 3: 61 Woodland-Gibson Road 1 100
## 4: 38 Woodland-Gibson Road 1 100
## 5: 76 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
In our 2004 dataset we have 19233 observations with 20 variables.
# Read the 2019 daily PM2.5 export and report its dimensions (rows, columns).
# `file =` is passed explicitly: the path contains a space, and fread()'s
# `input` auto-detection treats such a string as a shell command when the
# file is not found, instead of failing with a clear "file not found" error.
# NOTE(review): this is an absolute, user-specific path; the script only runs
# on a machine where the file exists at exactly this location.
aqi2019 <- data.table::fread(
  file = 'C:/Users/Brandyn Ruiz/OneDrive/USC/PM566/Assignment1/aqi2019.csv'
)
dim(aqi2019)
## [1] 53328 20
# List the column names of the 2019 table.
colnames(aqi2019)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
# Show the type and first few values of every column in the 2019 table.
str(aqi2019)
## Classes 'data.table' and 'data.frame': 53328 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Print the first six rows of the 2019 table.
head(aqi2019, n = 6L)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2019 AQS 60010007 3 5.7 ug/m3 LC
## 2: 01/02/2019 AQS 60010007 3 11.9 ug/m3 LC
## 3: 01/03/2019 AQS 60010007 3 20.1 ug/m3 LC
## 4: 01/04/2019 AQS 60010007 3 28.8 ug/m3 LC
## 5: 01/05/2019 AQS 60010007 3 11.2 ug/m3 LC
## 6: 01/06/2019 AQS 60010007 3 2.7 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 24 Livermore 1 100
## 2: 50 Livermore 1 100
## 3: 68 Livermore 1 100
## 4: 86 Livermore 1 100
## 5: 47 Livermore 1 100
## 6: 11 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88101 PM2.5 - Local Conditions 41860
## 3: 88101 PM2.5 - Local Conditions 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88101 PM2.5 - Local Conditions 41860
## 6: 88101 PM2.5 - Local Conditions 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Print the last six rows of the 2019 table.
tail(aqi2019, n = 6L)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
In our 2019 dataset we have 53328 observations with 20 variables. There are more records in 2019 than in 2004, which could possibly be due to a larger number of sites.
# Stack the 2004 and 2019 tables into one table.
# Both files have the same 20 columns, so this is a row-wise append. The
# previous full_join() without `by` joined on all 20 columns to get the same
# rows, which is slower, prints a "Joining, by = ..." message, and would merge
# any row that happened to be identical in both years.
# use.names = TRUE matches columns by name rather than by position.
join <- data.table::rbindlist(list(aqi2004, aqi2019), use.names = TRUE)
head(join)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2004 AQS 60010007 1 8.9 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 19.5 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 37 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 67 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Print the last six rows of the combined table (these come from 2019).
tail(join, n = 6L)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
# Parse the month/day/year text dates into Date objects and derive the
# calendar year used to split the analysis into 2004 vs 2019.
# Columns are referenced through mutate()'s data mask (Date, not join$Date)
# so the expression always refers to the table being piped in.
# lubridate::year() is namespaced because data.table also exports year();
# which one a bare year() resolves to depends on library() load order.
join <- join %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    Year = lubridate::year(Date)
  )
head(join$Year)
## [1] 2004 2004 2004 2004 2004 2004
# Give the columns used below short, syntactic names.
# Columns are renamed by name rather than by position, so the wrong column is
# never renamed silently if the file layout changes; rename() errors when a
# source column is missing.
join <- join %>%
  rename(
    PM2.5 = `Daily Mean PM2.5 Concentration`,
    SiteName = `Site Name`,
    lat = SITE_LATITUDE,
    lon = SITE_LONGITUDE
  )
names(join)
## [1] "Date" "Source" "Site ID"
## [4] "POC" "PM2.5" "UNITS"
## [7] "DAILY_AQI_VALUE" "SiteName" "DAILY_OBS_COUNT"
## [10] "PERCENT_COMPLETE" "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME" "STATE_CODE"
## [16] "STATE" "COUNTY_CODE" "COUNTY"
## [19] "lat" "lon" "Year"
# Map every monitoring location, coloured by the year it reported in.
# `join` has one row per daily reading, so it is first reduced to one row per
# (Year, lat, lon); otherwise every reading is drawn as its own circle and
# tens of thousands of identical circles are stacked in the widget.
site_locations <- distinct(join, Year, lat, lon)
# Year takes only two values, so a categorical palette is used and a matching
# two-entry legend is added: 2004 = red, 2019 = blue.
pal <- colorFactor(c('red', 'blue'), domain = c(2004, 2019))
# pal(join$Year)
leaflet(site_locations) %>%
  addProviderTiles('OpenStreetMap') %>%
  addCircles(
    lat = ~lat, lng = ~lon, opacity = 1, fillOpacity = 1, radius = 100,
    color = ~pal(Year)
  ) %>%
  addLegend(
    position = 'bottomright', pal = pal, values = ~Year, title = 'Year',
    opacity = 1
  )
There are significantly more testing sites in 2019. The majority of the testing sites are clustered in and around the larger cities of California, but there are also recording sites spread throughout the rest of the state.
# Count missing PM2.5 readings.
sum(is.na(join[["PM2.5"]]))
## [1] 0
# Count PM2.5 readings that are below zero.
sum(join[["PM2.5"]] < 0)
## [1] 293
# Share of all readings that are below zero.
sum(join[["PM2.5"]] < 0) / nrow(join)
## [1] 0.004037982
Our dataset has no missing values for the daily mean concentration of PM 2.5, but it does have 293 values that are below 0. For particles suspended in the air a mean concentration below 0 does not make sense, as the minimum possible value is 0. The proportion of PM 2.5 values reported below 0 out of the total number of records is small, at about 0.4% (less than 0.5%).
# State level: distribution of daily PM2.5 in California, one box per year.
ggplot(
  data = join,
  mapping = aes(x = STATE, y = PM2.5, fill = factor(Year))
) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentration within California',
    x = 'State',
    fill = 'Year'
  )
Comparing the two years within California, we see that 2004 has more extreme outliers than 2019, as well as a significantly wider range.
# State level: daily PM2.5 readings over time, one plot per year.
# NOTE(review): each date has readings from several sites, and geom_line()
# connects all of them in a single path; consider aggregating per day or
# grouping by site if a per-site or average trend is intended.
join %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in 2004')
join %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in 2019')
In our time series visualization for 2004 we see an unusual spike in PM 2.5 concentration in July-August, peaking at about 250. Otherwise, the concentration follows a cyclical pattern, rising and falling with spikes in every quarter of the year. The 2019 PM 2.5 concentration for the state of California follows the same kind of cyclical pattern, but with many spikes towards the last quarter of the year; the largest spike is in October, reaching just above 120.
# State level: five-number summary plus mean of daily PM2.5, one row per year.
# Each statistic is a length-1 summary in its own column. Returning the three
# quartiles as one length-3 column made summarise() emit three rows per year
# with min/mean/max repeated on each, and multi-row summarise() results are
# deprecated since dplyr 1.1.0.
join %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = median(PM2.5),
    mean = mean(PM2.5),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    max = max(PM2.5)
  )
## # A tibble: 2 x 7
## Year min q25 median mean q75 max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004 -0.1 6 10.1 13.1 16.3 251
## 2 2019 -2.2 4 6.5 7.74 9.9 121.
At the state level in California we see that PM 2.5 concentrations are much higher in 2004 than in 2019. The spread is also wider in 2004: the interquartile range (1st to 3rd quartile) is a little over 10 in 2004 (6 to 16.3), compared to about 6 in 2019 (4 to 9.9). Overall, there is a decrease in the concentration of PM 2.5 from 2004 to 2019 in the state of California.
# County level: keep only the readings taken in Los Angeles County.
countyLA <- filter(join, COUNTY == 'Los Angeles')
head(countyLA, n = 6L)
## Date Source Site ID POC PM2.5 UNITS DAILY_AQI_VALUE SiteName
## 1: 2004-01-01 AQS 60370002 1 18.0 ug/m3 LC 63 Azusa
## 2: 2004-01-02 AQS 60370002 1 20.4 ug/m3 LC 68 Azusa
## 3: 2004-01-03 AQS 60370002 1 8.0 ug/m3 LC 33 Azusa
## 4: 2004-01-07 AQS 60370002 1 23.6 ug/m3 LC 75 Azusa
## 5: 2004-01-08 AQS 60370002 1 28.3 ug/m3 LC 85 Azusa
## 6: 2004-01-09 AQS 60370002 1 21.9 ug/m3 LC 72 Azusa
## DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE AQS_PARAMETER_DESC
## 1: 1 100 88101 PM2.5 - Local Conditions
## 2: 1 100 88101 PM2.5 - Local Conditions
## 3: 1 100 88101 PM2.5 - Local Conditions
## 4: 1 100 88101 PM2.5 - Local Conditions
## 5: 1 100 88101 PM2.5 - Local Conditions
## 6: 1 100 88101 PM2.5 - Local Conditions
## CBSA_CODE CBSA_NAME STATE_CODE STATE
## 1: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## 2: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## 3: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## 4: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## 5: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## 6: 31080 Los Angeles-Long Beach-Anaheim, CA 6 California
## COUNTY_CODE COUNTY lat lon Year
## 1: 37 Los Angeles 34.1365 -117.9239 2004
## 2: 37 Los Angeles 34.1365 -117.9239 2004
## 3: 37 Los Angeles 34.1365 -117.9239 2004
## 4: 37 Los Angeles 34.1365 -117.9239 2004
## 5: 37 Los Angeles 34.1365 -117.9239 2004
## 6: 37 Los Angeles 34.1365 -117.9239 2004
# County level: distribution of daily PM2.5 in LA County, one box per year.
ggplot(
  data = countyLA,
  mapping = aes(x = COUNTY, y = PM2.5, fill = factor(Year))
) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentrations in LA county in 2004 and 2019',
    x = 'County',
    fill = 'Year'
  )
Comparing the PM 2.5 concentrations in LA county between 2004 and 2019, concentrations are higher in 2004, as its quartiles are greater than those of 2019. However, 2019 has the more extreme outliers, the greatest being about 120.
# County level (Los Angeles County): daily PM2.5 readings over time, one plot
# per year.
# NOTE(review): the county contains several sites, and geom_line() connects
# the readings of all of them in a single path; consider aggregating per day
# or grouping by site if a per-site or average trend is intended.
countyLA %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA county for 2004')
countyLA %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA county for 2019')
Our time series visual gives a closer look at the concentrations in 2004 and 2019: in 2004 the range is just above 60, whereas in 2019 the concentrations mostly stay just under 25. Both years follow a cyclical pattern, but 2004 has the larger spikes of PM 2.5. The two outliers in 2019 distort our visual, as the rest of the concentrations are rather consistent and much lower than the concentrations in 2004.
# County level (Los Angeles County): five-number summary plus mean of daily
# PM2.5, one row per year.
# Each statistic is a length-1 summary in its own column. Returning the three
# quartiles as one length-3 column made summarise() emit three rows per year
# with min/mean/max repeated on each, and multi-row summarise() results are
# deprecated since dplyr 1.1.0.
countyLA %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = median(PM2.5),
    mean = mean(PM2.5),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    max = max(PM2.5)
  )
## # A tibble: 2 x 7
## Year min q25 median mean q75 max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004 0.1 10.5 14.7 17.1 20.4 75.6
## 2 2019 -0.5 6.4 9.5 10.2 12.9 121.
From our summary statistics we see that the maximum concentration in 2004 is much lower than the maximum in 2019, but 2004 has the wider spread between its quartiles. Overall, there is a decrease in PM 2.5 concentrations from 2004 to 2019 at the Los Angeles county level.
# Site level: keep only the readings from the Los Angeles-North Main Street
# monitor and show its first twelve rows.
site <- filter(join, SiteName == "Los Angeles-North Main Street")
head(site, n = 12L)
## Date Source Site ID POC PM2.5 UNITS DAILY_AQI_VALUE
## 1: 2004-01-01 AQS 60371103 1 42.1 ug/m3 LC 117
## 2: 2004-01-02 AQS 60371103 1 25.3 ug/m3 LC 79
## 3: 2004-01-03 AQS 60371103 1 4.8 ug/m3 LC 20
## 4: 2004-01-07 AQS 60371103 1 28.1 ug/m3 LC 85
## 5: 2004-01-08 AQS 60371103 1 36.2 ug/m3 LC 103
## 6: 2004-01-09 AQS 60371103 1 29.6 ug/m3 LC 88
## 7: 2004-01-10 AQS 60371103 1 8.9 ug/m3 LC 37
## 8: 2004-01-11 AQS 60371103 1 15.2 ug/m3 LC 58
## 9: 2004-01-12 AQS 60371103 1 49.7 ug/m3 LC 136
## 10: 2004-01-13 AQS 60371103 1 19.9 ug/m3 LC 67
## 11: 2004-01-15 AQS 60371103 1 20.4 ug/m3 LC 68
## 12: 2004-01-16 AQS 60371103 1 36.6 ug/m3 LC 104
## SiteName DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: Los Angeles-North Main Street 1 100
## 2: Los Angeles-North Main Street 1 100
## 3: Los Angeles-North Main Street 1 100
## 4: Los Angeles-North Main Street 1 100
## 5: Los Angeles-North Main Street 1 100
## 6: Los Angeles-North Main Street 1 100
## 7: Los Angeles-North Main Street 1 100
## 8: Los Angeles-North Main Street 1 100
## 9: Los Angeles-North Main Street 1 100
## 10: Los Angeles-North Main Street 1 100
## 11: Los Angeles-North Main Street 1 100
## 12: Los Angeles-North Main Street 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 31080
## 2: 88101 PM2.5 - Local Conditions 31080
## 3: 88101 PM2.5 - Local Conditions 31080
## 4: 88101 PM2.5 - Local Conditions 31080
## 5: 88101 PM2.5 - Local Conditions 31080
## 6: 88101 PM2.5 - Local Conditions 31080
## 7: 88101 PM2.5 - Local Conditions 31080
## 8: 88101 PM2.5 - Local Conditions 31080
## 9: 88101 PM2.5 - Local Conditions 31080
## 10: 88101 PM2.5 - Local Conditions 31080
## 11: 88101 PM2.5 - Local Conditions 31080
## 12: 88101 PM2.5 - Local Conditions 31080
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 2: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 3: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 4: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 5: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 6: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 7: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 8: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 9: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 10: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 11: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## 12: Los Angeles-Long Beach-Anaheim, CA 6 California 37
## COUNTY lat lon Year
## 1: Los Angeles 34.06659 -118.2269 2004
## 2: Los Angeles 34.06659 -118.2269 2004
## 3: Los Angeles 34.06659 -118.2269 2004
## 4: Los Angeles 34.06659 -118.2269 2004
## 5: Los Angeles 34.06659 -118.2269 2004
## 6: Los Angeles 34.06659 -118.2269 2004
## 7: Los Angeles 34.06659 -118.2269 2004
## 8: Los Angeles 34.06659 -118.2269 2004
## 9: Los Angeles 34.06659 -118.2269 2004
## 10: Los Angeles 34.06659 -118.2269 2004
## 11: Los Angeles 34.06659 -118.2269 2004
## 12: Los Angeles 34.06659 -118.2269 2004
# Site level: distribution of daily PM2.5 at the LA site, one box per year.
ggplot(
  data = site,
  mapping = aes(x = SiteName, y = PM2.5, fill = factor(Year))
) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentrations in LA site in 2004 and 2019',
    x = 'Site Name',
    fill = 'Year'
  )
With our boxplot for the LA site we see that PM 2.5 concentrations are higher in 2004, as its quartiles are significantly greater than those of 2019. In 2004 there are also more extreme outliers, the greatest being around 75.
# Site level (Los Angeles-North Main Street): daily PM2.5 readings over time,
# one plot per year.
site %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA site for 2004')
site %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA site for 2019')
In our time series visual we see that the PM 2.5 concentration in 2004 is well spread, with the highest peak in March and another in October. Both years follow a cyclical pattern, but 2019 has a tighter cyclical pattern with sharper lines, meaning the PM 2.5 concentration at the LA site changes rapidly from day to day in 2019. The greatest peak in 2019 occurs in September, at just over 43.
# Site level (Los Angeles-North Main Street): five-number summary plus mean
# of daily PM2.5, one row per year.
# Reuses `site`, the table already filtered to this monitor, instead of
# filtering `join` a second time with a duplicated site-name literal.
# Each statistic is a length-1 summary in its own column. Returning the three
# quartiles as one length-3 column made summarise() emit three rows per year
# with min/mean/max repeated on each, and multi-row summarise() results are
# deprecated since dplyr 1.1.0.
site %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = median(PM2.5),
    mean = mean(PM2.5),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    max = max(PM2.5)
  )
## # A tibble: 2 x 7
## Year min q25 median mean q75 max
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004 2 12.8 16.8 20.1 23.4 75
## 2 2019 1.9 7.9 10.9 11.7 14.5 43.5
From our summary statistics we see that 2004 has the wider spread: its interquartile range (12.8 to 23.4) is greater than that of 2019 (7.9 to 14.5). We see that there is a decrease in the concentration of PM 2.5 from 2004 to 2019 at the Los Angeles site level.